# The mpg data frame is a rectangular collection of data (fields and
# records) that ships with ggplot2 (ggplot2::mpg).
#   displ - a car's engine size, in liters
#   hwy   - a car's fuel efficiency on the highway
# Evaluating the bare name at top level auto-prints the tibble.
mpg
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manu… f 21 29 p comp…
## 3 audi a4 2 2008 4 manu… f 20 31 p comp…
## 4 audi a4 2 2008 4 auto… f 21 30 p comp…
## 5 audi a4 2.8 1999 6 auto… f 16 26 p comp…
## 6 audi a4 2.8 1999 6 manu… f 18 26 p comp…
## 7 audi a4 3.1 2008 6 auto… f 18 27 p comp…
## 8 audi a4 q… 1.8 1999 4 manu… 4 18 26 p comp…
## 9 audi a4 q… 1.8 1999 4 auto… 4 16 25 p comp…
## 10 audi a4 q… 2 2008 4 manu… 4 20 28 p comp…
## # … with 224 more rows
# Open the documentation page for the mpg dataset
?mpg
# Building a first ggplot2 graphic
## geom_point() contributes a layer of points to the plot,
## which yields a scatterplot
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))
# Reusable graphing template
# ggplot(data = <DATA>) + <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
# A ggplot() call that is given data only, with no layers added
ggplot(mpg)
# Copy mtcars into the workspace; dput() also writes the object's
# structure() representation to the console and returns the object
mtcars <- dput(x = mtcars)
## structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
## 24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
## 30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
## 19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
## 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
## disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
## 167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
## 71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
## 301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
## 123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
## 150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
## 3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
## 3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
## 3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
## ), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
## 3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
## 1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
## 1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
## 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
## 18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
## 17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
## ), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
## 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
## 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
## 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
## 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
## 3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
## 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
## 2, 2, 4, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag",
## "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
## "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C",
## "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
## "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
## "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
## "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
## "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
## "Volvo 142E"), class = "data.frame")
# Number of rows in mtcars
NROW(mtcars)
## [1] 32
# Number of columns in mtcars
NCOL(mtcars)
## [1] 11
# Scatterplot with cyl on the x axis and hwy on the y axis
ggplot(mpg) +
  geom_point(aes(x = cyl, y = hwy))
# Why is this not a useful graph?
ggplot(mpg) +
  geom_point(aes(x = class, y = drv))
# Color is a good choice for discrete variables
p <- ggplot(mpg)
p + geom_point(aes(x = displ, y = hwy, color = class))
# Size is better suited to continuous variables; mapping it to the
# discrete variable class produces the warning echoed below
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.
# Top
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.
# Bottom
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).
# Set aesthetics by hand (outside aes()): all points blue, with the
# stroke aesthetic set to 5
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue", stroke = 5)
# Facets: subplots that each show one subset of the data
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(vars(class), nrow = 2)
# Facet the plot on the combination of two variables; these are
# generally discrete (drv and cyl in this case)
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(rows = vars(drv), cols = vars(cyl))
# Facet by rows only (drv)
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(rows = vars(drv))
# Facet by columns only (cyl)
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(cols = vars(cyl))
# Wrapped facets by class, laid out in two rows
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(vars(class), nrow = 2)
# Chart types are named after their geom: bar charts use bar geoms, line
# charts use line geoms, boxplots use boxplot geoms, and so on.
# Scatterplots break the trend; they use the point geom.
# left
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))
# right
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# A different linetype for each unique value of the mapped variable
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# ggplot2 provides over 30 geoms
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# One smooth per drv value via the group aesthetic
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# One colored smooth per drv value, with the legend requested explicitly
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, color = drv), show.legend = TRUE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Several geoms in one plot: each layer carries its own copy of the mapping
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Suppose the y axis should display cty instead of hwy: with per-layer
# mappings the variable has to be changed in two places, and one is easy
# to forget. Passing the mappings to ggplot() avoids that repetition.
# Global mapping: supplied once in the ggplot() call
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Local mappings extend or overwrite the global ones for that layer only
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# The smooth line is fitted to just a subset of mpg (class "subcompact"),
# while the point layer still uses the full data set.
# filter() is namespace-qualified on purpose: a bare filter() resolves to
# stats::filter() (a time-series filter) unless dplyr is attached, and this
# script never attaches dplyr explicitly before this point.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class)) +
  geom_smooth(
    data = dplyr::filter(mpg, class == "subcompact"),
    se = FALSE
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# The two calls below draw exactly the same plot
# Version 1: data and mapping declared once, globally
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Version 2: data and mapping repeated inside every layer
ggplot() +
  geom_point(aes(x = displ, y = hwy), data = mpg) +
  geom_smooth(aes(x = displ, y = hwy), data = mpg)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# stat: statistical transformation
# (This section heading must stay a comment: as bare text it is not valid R
# and prevents the whole script from parsing.)
# The diamonds dataset comes with ggplot2 and contains information about
# ~54,000 diamonds, including the price, carat, color, clarity, and cut of
# each diamond.
# Bar charts seem simple, but they are interesting because they reveal
# something subtle about plots.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))
# The previous plot can be re-created with stat_count() instead of geom_bar():
ggplot(data = diamonds) +
  stat_count(mapping = aes(x = cut))
# Dig a bit deeper into how bar charts are built:
# a small hand-written table whose b column already holds the bar heights
demo <- tribble(
  ~a,      ~b,
  "bar_1", 20,
  "bar_2", 30,
  "bar_3", 40
)
# stat = "identity" is passed so that b is mapped to y as-is
ggplot(demo) +
  geom_bar(aes(x = a, y = b), stat = "identity")
# NOTE: this section uses after_stat() and the fun/fun.min/fun.max arguments,
# which replace the deprecated ..prop.. notation and fun.y/fun.ymin/fun.ymax.
# Both require ggplot2 >= 3.3.0.
# Proportion bar chart: y is taken from the prop variable computed by the stat
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))
# Summarizes the y values for each unique x value
# (min, max and median of depth per cut)
ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )
# WITHOUT GROUP
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop)))
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = color, y = after_stat(prop)))
# WITH GROUP
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))
ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = cut, fill = color, y = after_stat(prop), group = 1)
  )
# Map cut to the color aesthetic, then to the fill aesthetic
ggplot(diamonds) +
  geom_bar(aes(x = cut, color = cut))
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = cut))
# Map the fill aesthetic to another variable, like clarity: the bars are
# automatically stacked.
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity))
# position = "identity" places each object exactly where it falls in the
# context of the graph. It is more useful for 2D geoms, like points, where
# it is the default.
ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar(alpha = 1 / 5, position = "identity")
ggplot(diamonds, aes(x = cut, color = clarity)) +
  geom_bar(fill = NA, position = "identity")
# position = "fill": useful for comparing proportions
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity), position = "fill")
# position = "dodge": makes it easier to compare individual values
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity), position = "dodge")
# position = "jitter": adds a small amount of random noise to each point,
# which lets us see where the data are densest.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), position = "jitter")
# geom_jitter(): adds a small amount of random variation to the location of
# each point
# NOTE(review): geom_point() is layered here as well, so the un-jittered
# points are drawn too - confirm that showing both is intended.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point() +
  geom_jitter()
# geom_count(): counts the number of observations at each location; useful
# for discrete data and overplotting
# NOTE(review): geom_point() is layered here as well - confirm intended.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point() +
  geom_count()
# Vertical box plot
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()
# Horizontal box plot, obtained by flipping the axes
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  coord_flip()
# Draw a map from coordinate data
nz <- map_data("nz")
ggplot(data = nz, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", color = "black")
# coord_quickmap(): sets the aspect ratio correctly for maps
ggplot(data = nz, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", color = "black") +
  coord_quickmap()
# labs(): modify axis, legend, and plot labels
bar <- ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = cut), width = 1, show.legend = FALSE) +
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)
# The same bar chart under two different coordinate systems
bar + coord_flip()
bar + coord_polar()
# coord_fixed(): forces a specific ratio between the physical
# representation of data units on the axes
# geom_abline(): adds a reference line specified by slope and intercept
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point() +
  geom_abline() +
  coord_fixed()
# Function Template
# ggplot(data = <DATA>) +
# <GEOM_FUNCTION>(
# mapping = aes(<MAPPINGS>),
# stat = <STAT>,
# position = <POSITION>
# )+ <COORDINATE_FUNCTION> + <FACET_FUNCTION>
# Read the documentation; help(<topic>) is the long form of ?<topic>
help(mpg)
# Variable (column) names
colnames(mpg)
## [1] "manufacturer" "model" "displ" "year"
## [5] "cyl" "trans" "drv" "cty"
## [9] "hwy" "fl" "class"
# Dimensions: rows, then columns
dim(mpg)
## [1] 234 11
# Class vector of the object
class(mpg)
## [1] "tbl_df" "tbl" "data.frame"
# Print the first 2 rows of the data frame
head(mpg, n = 2)
## # A tibble: 2 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(… f 18 29 p comp…
## 2 audi a4 1.8 1999 4 manua… f 21 29 p comp…
# Print the last 2 rows of the data frame
tail(mpg, n = 2)
## # A tibble: 2 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 volkswagen passat 2.8 1999 6 manu… f 18 26 p mids…
## 2 volkswagen passat 3.6 2008 6 auto… f 17 26 p mids…
# Per-column summary of mpg
summary(object = mpg)
## manufacturer model displ year
## Length:234 Length:234 Min. :1.600 Min. :1999
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
## Mode :character Mode :character Median :3.300 Median :2004
## Mean :3.472 Mean :2004
## 3rd Qu.:4.600 3rd Qu.:2008
## Max. :7.000 Max. :2008
## cyl trans drv cty
## Min. :4.000 Length:234 Length:234 Min. : 9.00
## 1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
## Median :6.000 Mode :character Mode :character Median :17.00
## Mean :5.889 Mean :16.86
## 3rd Qu.:8.000 3rd Qu.:19.00
## Max. :8.000 Max. :35.00
## hwy fl class
## Min. :12.00 Length:234 Length:234
## 1st Qu.:18.00 Class :character Class :character
## Median :24.00 Mode :character Mode :character
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
# Attach skimr; as the startup message below reports, its filter masks
# stats::filter from this point on
library("skimr")
## Warning: package 'skimr' was built under R version 3.5.2
##
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
##
## filter
# Summary statistics for mpg, grouped by variable type
skim(mpg)
## Skim summary statistics
## n obs: 234
## n variables: 11
##
## ── Variable type:character ──────────────────────────────────────────
## variable missing complete n min max empty n_unique
## class 0 234 234 3 10 0 7
## drv 0 234 234 1 1 0 3
## fl 0 234 234 1 1 0 5
## manufacturer 0 234 234 4 10 0 15
## model 0 234 234 2 22 0 38
## trans 0 234 234 8 10 0 10
##
## ── Variable type:integer ────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## cty 0 234 234 16.86 4.26 9 14 17 19 35
## cyl 0 234 234 5.89 1.61 4 4 6 8 8
## hwy 0 234 234 23.44 5.95 12 18 24 27 44
## year 0 234 234 2003.5 4.51 1999 1999 2003.5 2008 2008
## hist
## ▅▇▇▇▁▁▁▁
## ▇▁▁▇▁▁▁▇
## ▃▇▃▇▅▁▁▁
## ▇▁▁▁▁▁▁▇
##
## ── Variable type:numeric ────────────────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100 hist
## displ 0 234 234 3.47 1.29 1.6 2.4 3.3 4.6 7 ▇▇▅▅▅▃▂▁